In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from seaborn.palettes import color_palette
import warnings
warnings.filterwarnings('ignore')
# Seed BOTH random number generators: scikit-learn and numpy-based code draw
# from numpy's global RNG, so seeding only the stdlib `random` module does not
# make the model results below reproducible.
random.seed(950)
np.random.seed(950)

1. Load the dataset¶

In [2]:
# Load the credit card fraud dataset (expects creditcard.csv in the working directory)
df_g06 = pd.read_csv('creditcard.csv')
In [3]:
# (rows, columns) of the dataset — 284,807 transactions x 31 columns
df_g06.shape
Out[3]:
(284807, 31)
In [4]:
# Class balance: highly unbalanced (284,315 normal vs 492 fraud per the output below)
df_g06.Class.value_counts()
Out[4]:
Class
0    284315
1       492
Name: count, dtype: int64

2. Show first 6 data points¶

In [9]:
# Show the first 6 data points
df_g06.head(6)
Out[9]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0
5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 0.260314 -0.568671 ... -0.208254 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 3.67 0

6 rows × 31 columns

3. Describe pandas Dataframe¶

In [10]:
# Summary statistics (count/mean/std/min/quartiles/max) for every column
df_g06.describe()
Out[10]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 ... 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 284807.000000
mean 94813.859575 1.168375e-15 3.416908e-16 -1.379537e-15 2.074095e-15 9.604066e-16 1.487313e-15 -5.556467e-16 1.213481e-16 -2.406331e-15 ... 1.654067e-16 -3.568593e-16 2.578648e-16 4.473266e-15 5.340915e-16 1.683437e-15 -3.660091e-16 -1.227390e-16 88.349619 0.001727
std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 ... 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109 0.041527
min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 ... -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000 0.000000
25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 ... -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000 0.000000
50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 ... -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000 0.000000
75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 ... 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000 0.000000
max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 ... 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000 1.000000

8 rows × 31 columns

4. Show correlation heat plot¶

In [5]:
# compute the correlation matrix (pairwise correlation of all numeric columns)
cormat_g06 = df_g06.corr()
In [6]:
# Render the correlation matrix as a heat map: blue-to-red diverging palette,
# square cells, no per-cell numbers (31x31 would be unreadable annotated).
plt.figure(figsize=(12, 12))
heat_cmap = sns.diverging_palette(240, 20, as_cmap=True)
sns.heatmap(cormat_g06, cmap=heat_cmap, square=True, annot=False)
plt.title('Correlation heat plot')
plt.show()

According to this plot, we can observe that the correlations among V1-V28 are almost 0, which really makes sense because these variables come from PCA.

5. Show the scatterplot matrix¶

Since there is a huge amount of data in the dataset, creating a scatter matrix for the entire dataset does not seem possible on my computer. Thus, I only create the scatter matrix for the first five variables.

In [15]:
import plotly.express as px
import copy

# Work on a copy so casting 'Class' to str does not mutate df_g06.
# DataFrame.copy() is a deep copy by default — no need for copy.deepcopy.
df_forplot_g06 = df_g06.copy()
df_forplot_g06['Class'] = df_forplot_g06['Class'].astype(str)

# Scatter matrix of the first five feature columns only (columns 1..5);
# plotting all 30 features is infeasible (see note above).
fig = px.scatter_matrix(
    df_forplot_g06,
    dimensions=df_forplot_g06.columns[1:6],
    color='Class',
    title="Scatter Matrix Plot",
    color_discrete_map={'1': 'red', '0': 'blue'},  # fraud in red, normal in blue
    opacity=0.6,
    height=800,
    width=800
)
# Center the title horizontally.
fig.update_layout(
    title={
    'text': 'Scatter Matrix Plot',
    'x':0.5,
    'xanchor': 'center',
    'yanchor': 'top'})

fig.show()

Save to png file.

In [17]:
import plotly.io as pio

# Export the interactive figure to a static PNG (requires a plotly image engine, e.g. kaleido)
pio.write_image(fig, 'Scatter_Matrix.png')

6. Split the dataset into the training and test sets¶

In [19]:
from sklearn.model_selection import train_test_split

# Fix: the dataframe loaded earlier is named df_g06 — the original referenced
# an undefined `df`, which raises NameError on a fresh kernel run.
X_g06 = df_g06.drop('Class', axis=1)
y_g06 = df_g06['Class']

# NOTE(review): with only 492 positives, passing stratify=y_g06 would guarantee
# proportional class representation in the test split — consider adding it.
X_train_g06, X_test_g06, y_train_g06, y_test_g06 = train_test_split(X_g06, y_g06, test_size=0.2, random_state=950)

In this task, I choose 80%-20% split proportion for training and test sets, because there are more than 280000 rows in the dataset and 20% of the data points are enough for testing.

7. Perform classification routine by different machine learning models¶

In [20]:
# First normalize the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_std_g06 = scaler.fit_transform(X_train_g06)
X_test_std_g06 = scaler.transform(X_test_g06)
In [22]:
# import relevant libraries
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
In [23]:
# define the base models

def base_models():
    """Return a dict mapping model name -> unfitted classifier.

    Stochastic estimators are given random_state=950 so the cross-validation
    results are reproducible: seeding the stdlib `random` module (done at the
    top of the notebook) does not seed scikit-learn / xgboost, which use
    numpy-based RNGs.
    """
    models = dict()
    models["LR"] = LogisticRegression(random_state=950)
    models["KNN"] = KNeighborsClassifier()          # deterministic, no seed needed
    models["Tree"] = DecisionTreeClassifier(random_state=950)
    models["SVM"] = SVC(random_state=950)
    models["NB"] = GaussianNB()                     # deterministic, no seed needed
    models["Random Forest"] = RandomForestClassifier(random_state=950)
    models["Bagging"] = BaggingClassifier(random_state=950)
    models["GBM"] = GradientBoostingClassifier(random_state=950)
    models["XGB"] = XGBClassifier(random_state=950)
    return models
In [24]:
# Instantiate the dictionary of base models defined above
models_g06 = base_models()
In [43]:
# define a function to evaluate the model results

def eval_models(model, X=None, y=None):
    """Cross-validate `model` and return the per-fold scores.

    Uses 10-fold CV with macro-averaged recall (the metric chosen for this
    unbalanced dataset) across all CPU cores.

    Parameters
    ----------
    model : estimator
        An unfitted scikit-learn-compatible classifier.
    X, y : array-like, optional
        Features and labels. Default to the notebook-level standardized
        training data (kept as defaults so existing calls keep working).

    Returns
    -------
    np.ndarray of 10 fold scores.
    """
    if X is None:
        X = X_train_std_g06
    if y is None:
        y = y_train_g06
    scores = cross_val_score(model, X, y, cv=10, scoring='recall_macro', n_jobs=-1)
    return scores

In this task, since the data is extremely unbalanced, accuracy is rather meaningless. Furthermore, the cost of missing a positive instance (a fraudulent transaction) is high. Thus, I choose recall as the metric to evaluate the models.

In [45]:
# Accumulators: per-model CV fold scores and the matching model names
results_g06, names_g06 = list(), list()
In [46]:
# run the models and get the cross-validation results

# Cross-validate every base model and record its 10 fold scores.
for name, clf in models_g06.items():
    scores = eval_models(clf)
    results_g06.append(scores)
    names_g06.append(name)
    # NOTE(review): `name` and `scores` leak into the notebook namespace after
    # this loop finishes; later cells (the stacked-model print/append) reuse
    # the leftover `scores` — be careful before renaming these locals.
    print("-----------------", '\n')
    print('Average recall of %s is: %.3f' % (name, scores.mean()), '\n')
----------------- 

Average recall of LR is: 0.817 

----------------- 

Average recall of KNN is: 0.888 

----------------- 

Average recall of Tree is: 0.878 

/Users/Administrator/anaconda3/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py:700: UserWarning:

A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.

----------------- 

Average recall of SVM is: 0.833 

----------------- 

Average recall of NB is: 0.907 

----------------- 

Average recall of Random Forest is: 0.894 

----------------- 

Average recall of Bagging is: 0.887 

----------------- 

Average recall of GBM is: 0.783 

----------------- 

Average recall of XGB is: 0.896 

In [116]:
# convert the result list to dataframe for visualization

# Convert the result list to a long-format dataframe for visualization.
# Use names_g06 (populated by the CV loop) instead of a duplicated hard-coded
# list, so the column labels cannot drift out of sync with results_g06.
resultdf_g06 = pd.DataFrame(np.transpose(results_g06), columns=names_g06)
resultdf_g06 = pd.melt(resultdf_g06.reset_index(), id_vars='index', value_vars=names_g06)
In [117]:
# create a boxplot to display the model performance

from plotly.subplots import make_subplots
import plotly.graph_objs as go

# Box plot of the per-fold recall distributions, one box per model,
# with the individual fold scores overlaid as points.
axis_labels = {"variable": "Machine Learning Model",
               "value": "Recall"}
fig = px.box(
    resultdf_g06,
    x="variable",
    y="value",
    color="variable",
    points="all",
    labels=axis_labels,
    title="Model Performance",
)
# Center the title horizontally.
fig.update_layout(title={'x': 0.45, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
In [118]:
# Export the model-performance box plot to a static PNG
pio.write_image(fig, 'Boxplot1.png')

According to this plot, we can observe that Naive Bayes has the highest average recall, followed by Random Forest, while GBM performs worst (perhaps because I have not carefully tuned its parameters).

8. Select the best classifier and train a stacked model¶

Although Naive Bayes has the highest average recall, it is odd that the stacked model combining NB and logistic regression only returns recall = 0.5. Additionally, my computer cannot successfully generate the output using a stacked model with Random Forest and LR. Therefore, I choose KNN, another model with a relatively high recall value, as the level-0 learner.

In [82]:
def get_stacking():
    """Build the stacking ensemble used in this assignment.

    Level 0: a single KNN classifier (chosen per the discussion above).
    Level 1 (meta learner): logistic regression, trained on 5-fold
    out-of-fold predictions of the level-0 model.
    """
    base_estimators = [("KNN", KNeighborsClassifier())]
    meta_learner = LogisticRegression()
    return StackingClassifier(estimators=base_estimators,
                              final_estimator=meta_learner,
                              cv=5)
In [83]:
# Instantiate the stacking ensemble defined above
stack_g06 = get_stacking()
In [84]:
# 10-fold CV of the stacked model, same metric as the base models.
scores_g06 = cross_val_score(stack_g06, X_train_std_g06, y_train_g06, cv=10, scoring='recall_macro', n_jobs=-1)
# Fix: report the stacked model's own scores. The original printed
# `scores.mean()` — `scores` is a leftover variable from the earlier
# base-model loop, so the printed value did not belong to the stacked model.
print('Average recall of Stacked model is: %.3f' % (scores_g06.mean()), '\n')
Average recall of Stacked model is: 0.863 

In [86]:
# add the result of Stacked Model to the list

# Add the result of the Stacked Model to the list.
# Fix: append scores_g06 (the stacked model's CV scores) — the original
# appended `scores`, a leftover variable from the earlier base-model loop,
# so the "Stacked Model" box plotted below showed another model's scores.
results_g06.append(scores_g06)
names_g06.append("Stacked Model")
In [87]:
# Rebuild the long-format result dataframe, now including the stacked model.
# Use names_g06 (which now ends with "Stacked Model") instead of a duplicated
# hard-coded list, so the labels cannot drift out of sync with results_g06.
resultdf_g06 = pd.DataFrame(np.transpose(results_g06), columns=names_g06)
resultdf_g06 = pd.melt(resultdf_g06.reset_index(), id_vars='index', value_vars=names_g06)
In [110]:
# create a new box plot

# Box plot of per-fold recall for all models, now including the stacked model.
axis_labels = {"variable": "Machine Learning Model",
               "value": "Recall"}
fig = px.box(
    resultdf_g06,
    x="variable",
    y="value",
    color="variable",
    points="all",
    labels=axis_labels,
    title="Model Performance",
)
# Center the title horizontally.
fig.update_layout(title={'x': 0.45, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
In [111]:
pio.write_image(fig, 'Boxplot2.png')

According to this plot, the performance of stacked model is just at the average level among all the models.

In [89]:
# Fit the stacking ensemble on the full standardized training split
stack_g06.fit(X_train_std_g06, y_train_g06)
Out[89]:
StackingClassifier(cv=5, estimators=[('KNN', KNeighborsClassifier())],
                   final_estimator=LogisticRegression())
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StackingClassifier(cv=5, estimators=[('KNN', KNeighborsClassifier())],
                   final_estimator=LogisticRegression())
KNeighborsClassifier()
LogisticRegression()
In [90]:
# Predict class labels for the standardized test split
y_pred_g06 = stack_g06.predict(X_test_std_g06)
In [93]:
# print the confusion matrix for stacked model on test data

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from seaborn import set_palette
# NOTE(review): ConfusionMatrixDisplay and set_palette are imported here but
# never used anywhere in this notebook — candidates for removal.

# Rows = true class (0, 1), columns = predicted class (0, 1)
conf_matrix_g06 = confusion_matrix(y_test_g06, y_pred_g06)
print("Confusion Matrix:")
print(conf_matrix_g06)
Confusion Matrix:
[[56861     4]
 [   26    71]]

9. Export the Pickle model and import it back¶

In [94]:
import pickle
In [97]:
pkl_filename_g06 = "Assignment2_Pickle.pkl"

# Export the fitted model to a pickle file.
# Fix: the fitted stacking model is named stack_g06 — the original dumped
# `stack`, a name that is never defined and raises NameError.
with open(pkl_filename_g06, 'wb') as file:
    pickle.dump(stack_g06, file)

# Load from file (only load pickles you created yourself — unpickling
# untrusted files can execute arbitrary code).
with open(pkl_filename_g06, 'rb') as file:
    pickle_model = pickle.load(file)
In [98]:
# get the accuracy score
# NOTE(review): .score() reports accuracy, which is near 1.0 for any model on
# this highly unbalanced data — recall (examined below) is the meaningful metric.
score_g06 = pickle_model.score(X_test_std_g06, y_test_g06)
In [100]:
# Regenerate predictions from the unpickled model, and report its accuracy.
# (The two statements are independent; order does not matter.)
Y_predict_g06 = pickle_model.predict(X_test_std_g06)
print("Test score: {0:.2f} %".format(100 * score_g06))
Test score: 99.95 %
In [101]:
# print the confusion matrix

# Confusion matrix for the reloaded (unpickled) model on the test split —
# per the outputs it matches the pre-pickle model, as expected.
conf_matrix_g06 = confusion_matrix(y_test_g06, Y_predict_g06)

print(conf_matrix_g06)
[[56861     4]
 [   26    71]]

10. Show both text and visual confusion matrices¶

In [102]:
# Text version of the confusion matrix (rows = true class, columns = predicted)
conf_matrix_g06 = confusion_matrix(y_test_g06, Y_predict_g06)
print("Text Confusion Matrix: ")
print(conf_matrix_g06)
Text Confusion Matrix: 
[[56861     4]
 [   26    71]]
In [108]:
print("Visual Confusion Matrix: ")
fig, ax = plt.subplots()
cax = ax.matshow(conf_matrix_g06, cmap="coolwarm")

plt.title("Confusion Matrix Heatmap", pad=20)
fig.colorbar(cax)

# Set the labels.
# Fix: pin the tick positions before labelling them. The original called
# set_xticklabels([''] + labels) against matshow's automatic locator, which
# relies on the default tick layout and warns/errors on newer matplotlib
# (FixedFormatter without FixedLocator).
labels = ['Class: 0', 'Class: 1']
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels)
ax.xaxis.set_ticks_position('bottom')

# Annotate the cells with the values
for i in range(2):
    for j in range(2):
        ax.text(j, i, conf_matrix_g06[i, j], ha='center', va='center')

plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Visual Confusion Matrix: 

From this confusion matrix, it is clear that this model predicted correctly for almost all the data of class 0. Since the data is highly unbalanced, the accuracy is always near 100%. Therefore, I have mainly used recall as the evaluation metric throughout this assignment. The result shows that the recall of this model is relatively good, but not perfect.